NYC Traffic Accidents - Exploratory Data Analysis (Python)¶

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# pio.renderers.default = "notebook_connected"

clean_path = "../data/nyc_traffic_accidents_clean.csv"
df = pd.read_csv(clean_path)
df.shape, df.dtypes.head()
Out[1]:
((74881, 28),
 crash_date     object
 borough        object
 zip_code      float64
 latitude      float64
 longitude     float64
 dtype: object)

1. Monthly Trends¶

In [2]:
monthly = (
    df.groupby(["month", "month_name"])
      .agg(
          total_crashes=("collision_id", "count"),
          fatal_crashes=("is_fatal", "sum")
      )
      .reset_index()
      .sort_values("month")
)
total_fatal_crashes = sum(monthly["fatal_crashes"])
monthly["fatal_rate_pct"] = round((monthly["fatal_crashes"] / total_fatal_crashes) * 100, 2)
monthly
Out[2]:
month month_name total_crashes fatal_crashes fatal_rate_pct
0 1 January 14287 17 12.59
1 2 February 13684 20 14.81
2 3 March 11057 8 5.93
3 4 April 4116 13 9.63
4 5 May 6149 12 8.89
5 6 June 7616 30 22.22
6 7 July 9225 20 14.81
7 8 August 8747 15 11.11
In [3]:
fig = go.Figure()

# Bars: total crashes
fig.add_trace(go.Bar(
    x=monthly["month_name"],
    y=monthly["total_crashes"],
    name="Total Crashes"
))

# Line: fatality rate
fig.add_trace(go.Scatter(
    x=monthly["month_name"],
    y=monthly["fatal_rate_pct"],
    name="Share of Fatal Crashes (%)",
    mode="lines+markers",
    yaxis="y2"
))

fig.update_layout(
    title="Monthly Collision Trends (Jan-Aug 2020)",
    xaxis_title="Month",
    yaxis_title="Number of Crashes",
    yaxis2=dict(
        title="Share (%)",
        overlaying="y",
        side="right"
    ),
    barmode="group",
    template="plotly_white"
)

fig.show()

2. Day-of-Week Analysis¶

In [4]:
weekday_order = [
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
]

df["day_of_week"] = pd.Categorical(
    df["day_of_week"],
    categories=weekday_order,
    ordered=True
)

dow = (
    df.groupby("day_of_week", observed=True)
      .agg(
          total_crashes=("collision_id", "count"),
          fatal_crashes=("is_fatal", "sum")
      )
      .reset_index()
      .sort_values("day_of_week")
)

dow["fatal_rate_pct"] = round((dow["fatal_crashes"] / total_fatal_crashes) * 100, 2)
dow
Out[4]:
day_of_week total_crashes fatal_crashes fatal_rate_pct
0 Sunday 9003 22 16.30
1 Monday 10511 16 11.85
2 Tuesday 10613 25 18.52
3 Wednesday 10638 15 11.11
4 Thursday 11244 13 9.63
5 Friday 12271 16 11.85
6 Saturday 10601 28 20.74
In [5]:
fig = go.Figure()

# Bars: total crashes
fig.add_trace(go.Bar(
    x=dow["day_of_week"],
    y=dow["total_crashes"],
    name="Total Crashes"
))

# Line: fatality rate
fig.add_trace(go.Scatter(
    x=dow["day_of_week"],
    y=dow["fatal_rate_pct"],
    name="Share of Fatal Crashes (%)",
    mode="lines+markers",
    yaxis="y2"
))

fig.update_layout(
    title="Crashes by Day of Week",
    xaxis_title="Day of Week",
    yaxis_title="Number of Crashes",
    yaxis2=dict(
        title="Share (%)",
        overlaying="y",
        side="right"
    ),
    template="plotly_white"
)

fig.show()

3. Hour-of-Day Patterns¶

In [6]:
hourly = (
    df.groupby("hour")
      .agg(
          total_crashes=("collision_id", "count"),
          fatal_crashes=("is_fatal", "sum")
      )
      .reset_index()
      .sort_values("hour")
)

hourly.head()
Out[6]:
hour total_crashes fatal_crashes
0 0 2948 7
1 1 1474 7
2 2 1139 2
3 3 989 7
4 4 975 7
In [7]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=hourly["hour"],
    y=hourly["total_crashes"],
    name="Total Crashes",
    mode="lines+markers"
))

fig.update_layout(
    title="Crashes by Hour of Day",
    xaxis=dict(
        title="Hour of Day",
        dtick=1,
        tickvals=hourly["hour"]
    ),
    yaxis_title="Total Crashes",
    template="plotly_white"
)

fig.show()
In [8]:
weekday_df = df[~df["is_weekend"]]
weekend_df = df[df["is_weekend"]]

weekday_hourly = weekday_df.groupby("hour")["collision_id"].count() / weekday_df["crash_date"].nunique()
weekend_hourly = weekend_df.groupby("hour")["collision_id"].count() / weekend_df["crash_date"].nunique()

hourly_avg = pd.DataFrame({
    "hour": range(24),
    "weekday_avg": weekday_hourly.reindex(range(24), fill_value=0).values,
    "weekend_avg": weekend_hourly.reindex(range(24), fill_value=0).values
})

hourly_avg.head()
Out[8]:
hour weekday_avg weekend_avg
0 0 11.052023 15.014493
1 1 4.809249 9.304348
2 2 3.520231 7.681159
3 3 2.907514 7.043478
4 4 3.069364 6.434783
In [9]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=hourly_avg["hour"],
    y=hourly_avg["weekday_avg"],
    name="Weekday Avg",
    mode="lines+markers"
))

fig.add_trace(go.Scatter(
    x=hourly_avg["hour"],
    y=hourly_avg["weekend_avg"],
    name="Weekend Avg",
    mode="lines+markers"
))

fig.update_layout(
    title="Average Crashes by Hour",
    xaxis=dict(
        title="Hour of Day",
        dtick=1,
        tickvals=hourly_avg["hour"]
    ),
    yaxis_title="Avg Crashes per Day",
    template="plotly_white"
)

fig.show()
In [10]:
radar_df = hourly_avg.copy()
radar_df.loc[24] = radar_df.loc[0] # close the loop
radar_df["hour_label"] = radar_df["hour"].astype(int).astype(str)

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=radar_df["weekday_avg"],
    theta=radar_df["hour_label"],
    name="Weekday Avg",
))

fig.add_trace(go.Scatterpolar(
    r=radar_df["weekend_avg"],
    theta=radar_df["hour_label"],
    name="Weekend Avg",
))

fig.update_layout(
    title="Radar Chart - Weekday vs Weekend Hourly Patterns",
    template="plotly_white",
    polar=dict(
        radialaxis=dict(visible=True),
        angularaxis=dict(direction="clockwise")
    )
)

fig.show()

4. Day × Hour Heatmap¶

In [11]:
day_hour = (
    df.groupby(["day_of_week", "hour"], observed=True)["collision_id"]
      .count()
      .reset_index()
      .pivot(index="day_of_week", columns="hour", values="collision_id")
      .reindex(index=weekday_order)  # just to be explicit
)

all_hours = list(range(24))
day_hour = day_hour.reindex(columns=all_hours, fill_value=0)

day_hour
Out[11]:
hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
day_of_week
Sunday 510 312 271 254 232 195 182 165 231 241 ... 615 537 566 539 526 462 407 356 365 292
Monday 385 200 126 112 117 142 310 407 591 552 ... 682 690 746 736 658 494 373 325 278 219
Tuesday 333 134 88 96 78 159 294 433 643 576 ... 719 654 799 750 693 509 444 326 339 264
Wednesday 367 154 129 106 103 176 258 407 563 509 ... 741 730 756 743 688 520 402 378 366 259
Thursday 391 144 116 78 118 148 316 445 679 593 ... 722 683 814 778 703 556 454 399 340 283
Friday 436 200 150 111 115 178 315 383 648 638 ... 815 781 870 804 800 628 574 472 430 428
Saturday 526 330 259 232 212 180 193 223 323 330 ... 722 602 668 624 628 569 484 455 439 419

7 rows × 24 columns

In [12]:
fig = px.imshow(
    day_hour,
    labels=dict(
        x="Hour of Day",
        y="Day of Week",
        color="Number of Crashes"
    ),
    x=day_hour.columns,
    y=day_hour.index,
    aspect="auto",
    color_continuous_scale="Reds"
)

fig.update_layout(
    title="Crashes Heatmap - Day of Week × Hour of Day",
    xaxis=dict(
        tickvals=day_hour.columns
    ),
    template="plotly_white"
)

fig.show()

5. High-Risk Locations¶

In [13]:
streets = df[df["on_street_name"].notna()].copy()
streets["on_street_name"] = streets["on_street_name"].str.strip()

top_streets = (
    streets.groupby("on_street_name")
        .agg(total_crashes=("collision_id", "count"))
        .reset_index()
        .sort_values("total_crashes", ascending=False)
        .head(10)
)

# Add % of total collisions
total_crashes_citywide = len(df)
top_streets["pct_of_total"] = round((top_streets["total_crashes"] / total_crashes_citywide) * 100, 2)

top_streets
Out[13]:
on_street_name total_crashes pct_of_total
942 BELT PARKWAY 1241 1.66
2403 LONG ISLAND EXPRESSWAY 745 0.99
1051 BROOKLYN QUEENS EXPRESSWAY 738 0.99
1834 FDR DRIVE 728 0.97
2455 MAJOR DEEGAN EXPRESSWAY 591 0.79
1976 GRAND CENTRAL PKWY 581 0.78
1031 BROADWAY 575 0.77
742 ATLANTIC AVENUE 532 0.71
1322 CROSS BRONX EXPY 526 0.70
1325 CROSS ISLAND PARKWAY 512 0.68
In [14]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=top_streets["pct_of_total"],
    y=top_streets["on_street_name"],
    orientation="h",
    name="Share of Total Collisions (%)",
    text=top_streets["pct_of_total"].astype(str) + "%",
    textposition="outside"
))

fig.update_layout(
    title="Top 10 Streets by Collision Frequency",
    xaxis_title="Share of Total Collisions (%)",
    yaxis_title="Street",
    template="plotly_white",
)

fig.show()

6. Contributing Factors¶

In [15]:
cf = df.copy()

# normalize strings
cf["contributing_factor_vehicle_1"] = (
    cf["contributing_factor_vehicle_1"]
      .fillna("Unspecified")
      .str.strip()
)

# eliminate weird placeholders (Excel often had blanks)
cf["contributing_factor_vehicle_1"] = (
    cf["contributing_factor_vehicle_1"]
        .replace("", "Unspecified")
        .fillna("Unspecified")
)
cf_clean = cf[cf["contributing_factor_vehicle_1"] != "Unspecified"]
In [16]:
cf_all = (
    cf_clean.groupby("contributing_factor_vehicle_1")
      .agg(total_crashes=("collision_id", "count"))
      .reset_index()
      .sort_values("total_crashes", ascending=False)
      .head(10)
)

total_crashes_citywide = len(df)
cf_all["pct_of_total"] = ((cf_all["total_crashes"] / total_crashes_citywide) * 100).astype(int)

cf_all
Out[16]:
contributing_factor_vehicle_1 total_crashes pct_of_total
8 Driver Inattention/Distraction 19123 25
17 Following Too Closely 5202 6
14 Failure to Yield Right-of-Way 4815 6
32 Passing or Lane Usage Improper 2840 3
4 Backing Unsafely 2829 3
31 Passing Too Closely 2687 3
27 Other Vehicular 2233 2
49 Unsafe Speed 2015 2
48 Unsafe Lane Changing 1809 2
46 Traffic Control Disregarded 1544 2
In [17]:
fig = px.treemap(
    cf_all,
    path=["contributing_factor_vehicle_1"],
    values="pct_of_total",
    color="pct_of_total",
    color_continuous_scale="Blues",
    title="Top 10 Contributing Factors - Share of Total Collisions (%)"
)
fig.data[0].texttemplate="%{label}<br>%{value}%"
fig.show()
In [18]:
cf_fatal = (
    cf_clean[cf_clean["is_fatal"]]
      .groupby("contributing_factor_vehicle_1")
      .agg(fatal_crashes=("collision_id", "count"))
      .reset_index()
      .sort_values("fatal_crashes", ascending=False)
)

total_fatal = cf_fatal["fatal_crashes"].sum()
cf_fatal["pct_of_fatal"] = ((cf_fatal["fatal_crashes"] / total_fatal) * 100).astype(int)

cf_fatal.head(10)
Out[18]:
contributing_factor_vehicle_1 fatal_crashes pct_of_fatal
16 Unsafe Speed 33 32
14 Traffic Control Disregarded 15 14
3 Driver Inattention/Distraction 13 12
6 Failure to Yield Right-of-Way 10 9
13 Pedestrian/Bicyclist/Other Pedestrian Error/Co... 6 5
9 Illnes 5 4
4 Driver Inexperience 4 3
17 View Obstructed/Limited 3 2
2 Backing Unsafely 3 2
1 Alcohol Involvement 2 1
In [28]:
fig = px.treemap(
    cf_fatal.head(5),
    path=["contributing_factor_vehicle_1"],
    values="pct_of_fatal",
    color="pct_of_fatal",
    color_continuous_scale="Reds",
    title="Top Contributing Factors - Fatal Collisions (%)"
)
fig.data[0].texttemplate="%{label}<br>%{value}%"
fig.show()

7. Vehicle Types & Victims¶

In [20]:
vt = df.copy()
vt["vehicle_type_code_1"] = vt["vehicle_type_code_1"].fillna("Unspecified").str.strip()
vt_all = (
    vt.groupby("vehicle_type_code_1")
      .agg(total_crashes=("collision_id", "count"))
      .reset_index()
      .sort_values("total_crashes", ascending=False)
)

total_crashes = len(df)
vt_all["pct_of_total"] = round(vt_all["total_crashes"] / total_crashes * 100, 1)

vt_all_clean = vt_all[vt_all["vehicle_type_code_1"] != "Unspecified"]

vt_all_clean.head(10)
Out[20]:
vehicle_type_code_1 total_crashes pct_of_total
179 Sedan 34349 45.9
182 Station Wagon/Sport Utility Vehicle 27541 36.8
196 Taxi 2768 3.7
162 Pick-up Truck 1882 2.5
19 Box Truck 1417 1.9
22 Bus 950 1.3
17 Bike 882 1.2
203 Tractor Truck Diesel 587 0.8
138 Motorcycle 518 0.7
226 Van 430 0.6
In [21]:
vt_fatal = (
    vt[vt["is_fatal"]]
      .groupby("vehicle_type_code_1")
      .agg(fatal_crashes=("collision_id", "count"))
      .reset_index()
      .sort_values("fatal_crashes", ascending=False)
)

total_fatal = vt_fatal["fatal_crashes"].sum()
vt_fatal["pct_of_fatal"] = round(vt_fatal["fatal_crashes"] / total_fatal * 100, 1)

vt_fatal_clean = vt_fatal[vt_fatal["vehicle_type_code_1"] != "Unspecified"]

vt_fatal_clean
Out[21]:
vehicle_type_code_1 fatal_crashes pct_of_fatal
14 Sedan 45 33.3
15 Station Wagon/Sport Utility Vehicle 33 24.4
12 Motorcycle 20 14.8
16 Taxi 5 3.7
4 Bus 4 3.0
13 Pick-up Truck 3 2.2
3 Box Truck 3 2.2
8 E-Scooter 3 2.2
10 Flat Rack 2 1.5
17 Tractor Truck Diesel 2 1.5
6 Convertible 2 1.5
7 Dump 2 1.5
0 AMBULANCE 1 0.7
11 Motorbike 1 0.7
1 Ambulance 1 0.7
9 FORK LIFT 1 0.7
5 Concrete Mixer 1 0.7
2 Bike 1 0.7
19 Van 1 0.7
In [22]:
vt_merge = vt_all_clean.merge(vt_fatal_clean, on="vehicle_type_code_1", how="left")
vt_merge["fatal_crashes"] = vt_merge["fatal_crashes"].fillna(0)
vt_merge["pct_of_fatal"] = vt_merge["pct_of_fatal"].fillna(0)

vt_merge.head(10)
Out[22]:
vehicle_type_code_1 total_crashes pct_of_total fatal_crashes pct_of_fatal
0 Sedan 34349 45.9 45.0 33.3
1 Station Wagon/Sport Utility Vehicle 27541 36.8 33.0 24.4
2 Taxi 2768 3.7 5.0 3.7
3 Pick-up Truck 1882 2.5 3.0 2.2
4 Box Truck 1417 1.9 3.0 2.2
5 Bus 950 1.3 4.0 3.0
6 Bike 882 1.2 1.0 0.7
7 Tractor Truck Diesel 587 0.8 2.0 1.5
8 Motorcycle 518 0.7 20.0 14.8
9 Van 430 0.6 1.0 0.7
In [23]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=vt_merge["vehicle_type_code_1"].head(10),
    y=vt_merge["pct_of_total"].head(10),
    name="% of Total Collisions",
    text=vt_merge["pct_of_total"].round(1).astype(str) + "%",
    textposition="outside"
))

fig.add_trace(go.Bar(
    x=vt_merge["vehicle_type_code_1"].head(10),
    y=vt_merge["pct_of_fatal"].head(10),
    name="% of Fatal Collisions",
    text=vt_merge["pct_of_fatal"].round(1).astype(str) + "%",
    textposition="outside"
))

fig.update_layout(
    title="Vehicle Types Involved — Total vs Fatal Collisions",
    xaxis_title="Vehicle Type",
    yaxis_title="Percentage (%)",
    barmode="group",
    template="plotly_white",
    yaxis=dict(range=[0, max(vt_merge["pct_of_total"].max(), vt_merge["pct_of_fatal"].max()) * 1.2]),
)

fig.show()
In [24]:
injured = pd.DataFrame({
    "category": ["Motorists", "Pedestrians", "Cyclists"],
    "count": [
        df["number_of_motorist_injured"].sum(),
        df["number_of_pedestrians_injured"].sum(),
        df["number_of_cyclist_injured"].sum()
    ]
})
In [25]:
fig = px.pie(
    injured,
    names="category",
    values="count",
    title="Distribution of Injured People",
    color_discrete_sequence=px.colors.sequential.Blues
)

fig.show()
In [26]:
killed = pd.DataFrame({
    "category": ["Motorists", "Pedestrians", "Cyclists"],
    "count": [
        df["number_of_motorist_killed"].sum(),
        df["number_of_pedestrians_killed"].sum(),
        df["number_of_cyclist_killed"].sum()
    ]
})
In [27]:
fig = px.pie(
    killed,
    names="category",
    values="count",
    title="Distribution of Fatalities",
    color_discrete_sequence=px.colors.sequential.Reds
)

fig.show()